Winealysis - Exploring the world of good wines

1. Import required libraries & load dataset

# Print numbers in full instead of scientific notation (e.g. 100000, not 1e5)
options(scipen = 999)

# Relative locations of the raw data and of the helper scripts
path_data <- "../data/"
path_scripts <- "../scripts_r/"


# Run the helper script (presumably attaches the packages used below -- its
# contents are not visible here)
source(paste0(path_scripts, "load_packages.R"))


# Show which files are available, then read the 130k-review CSV
list.files(path_data)
## [1] "winemag-data_first150k.csv" "winemag-data-130k-v2.csv"  
## [3] "winemag-data-130k-v2.json"
data <- read.csv(paste0(path_data, "winemag-data-130k-v2.csv"))

2. Dataset quick summary (counts, summary, data types, structure)

# Report how many rows and columns the dataset has
cat("Dataset dimensions:\n")
## Dataset dimensions:
cat("Rows: ", nrow(data), "\n", sep = "")
## Rows: 129971
cat("Columns: ", ncol(data), "\n", sep = "")
## Columns: 14
## Columns: 14
# Missing (NA) values: overall total first, then a per-column breakdown.
# Only genuine NA entries are counted -- empty strings ("") in the character
# columns are not NA, so they do not show up in these figures.
cat("NULL values count: ", sum(is.na(data)), "\n", sep = "")
## NULL values count: 8996
cat("NULL values on each column:\n")
## NULL values on each column:
vapply(data, function(column) sum(is.na(column)), integer(1))
##                     X               country           description 
##                     0                     0                     0 
##           designation                points                 price 
##                     0                     0                  8996 
##              province              region_1              region_2 
##                     0                     0                     0 
##           taster_name taster_twitter_handle                 title 
##                     0                     0                     0 
##               variety                winery 
##                     0                     0
# Per-column summary: quartiles/mean for numeric columns, length and class
# for character columns
cat("Summary:\n")
## Summary:
summary(data)
##        X            country          description        designation       
##  Min.   :     0   Length:129971      Length:129971      Length:129971     
##  1st Qu.: 32492   Class :character   Class :character   Class :character  
##  Median : 64985   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 64985                                                           
##  3rd Qu.: 97478                                                           
##  Max.   :129970                                                           
##                                                                           
##      points           price           province           region_1        
##  Min.   : 80.00   Min.   :   4.00   Length:129971      Length:129971     
##  1st Qu.: 86.00   1st Qu.:  17.00   Class :character   Class :character  
##  Median : 88.00   Median :  25.00   Mode  :character   Mode  :character  
##  Mean   : 88.45   Mean   :  35.36                                        
##  3rd Qu.: 91.00   3rd Qu.:  42.00                                        
##  Max.   :100.00   Max.   :3300.00                                        
##                   NA's   :8996                                           
##    region_2         taster_name        taster_twitter_handle    title          
##  Length:129971      Length:129971      Length:129971         Length:129971     
##  Class :character   Class :character   Class :character      Class :character  
##  Mode  :character   Mode  :character   Mode  :character      Mode  :character  
##                                                                                
##                                                                                
##                                                                                
##                                                                                
##    variety             winery         
##  Length:129971      Length:129971     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
# Compact column-wise preview: name, type and the first few values
glimpse(data)
## Rows: 129,971
## Columns: 14
## $ X                     <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ country               <chr> "Italy", "Portugal", "US", "US", "US", "Spain",…
## $ description           <chr> "Aromas include tropical fruit, broom, brimston…
## $ designation           <chr> "Vulkà Bianco", "Avidagos", "", "Reserve Late H…
## $ points                <int> 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87,…
## $ price                 <dbl> NA, 15, 14, 13, 65, 15, 16, 24, 12, 27, 19, 30,…
## $ province              <chr> "Sicily & Sardinia", "Douro", "Oregon", "Michig…
## $ region_1              <chr> "Etna", "", "Willamette Valley", "Lake Michigan…
## $ region_2              <chr> "", "", "Willamette Valley", "", "Willamette Va…
## $ taster_name           <chr> "Kerin O’Keefe", "Roger Voss", "Paul Gregutt", …
## $ taster_twitter_handle <chr> "@kerinokeefe", "@vossroger", "@paulgwine ", ""…
## $ title                 <chr> "Nicosia 2013 Vulkà Bianco  (Etna)", "Quinta do…
## $ variety               <chr> "White Blend", "Portuguese Red", "Pinot Gris", …
## $ winery                <chr> "Nicosia", "Quinta dos Avidagos", "Rainstorm", …
# List the class of every column. Each column read by read.csv() here has a
# single class, so one string per column is expected.
cat("Columns data types:\n")
## Columns data types:
vapply(data, class, character(1))
##                     X               country           description 
##             "integer"           "character"           "character" 
##           designation                points                 price 
##           "character"             "integer"             "numeric" 
##              province              region_1              region_2 
##           "character"           "character"           "character" 
##           taster_name taster_twitter_handle                 title 
##           "character"           "character"           "character" 
##               variety                winery 
##           "character"           "character"
# Explore our two main numerical variables (points & price)

# Wine prices
# Wide range of prices, but concentrated in the lower pricing tier.
# The dashed vertical line marks the mean *price*. The price column contains
# missing values, so na.rm = TRUE is required: without it the mean is NA and
# no line is drawn.
wines_price_distribution <- data %>%
  ggplot(aes(x = price)) +
  geom_histogram(aes(y = ..density..), fill = "red", colour = "darkred", bins = 100) +
  geom_vline(aes(xintercept = mean(price, na.rm = TRUE)), color = "darkblue", linetype = "dashed", size = 1) +
  theme_minimal() +
  labs(title = "Wines Price Distribution - Price & Probability + Mean", x = "X = Price", y = "Y = Probability")

# Convert the static ggplot into an interactive plotly widget
ggplotly(wines_price_distribution)
## Warning: Removed 8996 rows containing non-finite values (stat_bin).
# Wine points
# Integers, between 80 and 100 -- 21 distinct values, hence bins = 21.
# The dashed vertical line marks the mean number of points.
# (geom_histogram() has no `template` argument; passing one only triggered an
# "Ignoring unknown parameters" warning, so it is not passed here.)
wines_points_distribution <- data %>%
  ggplot(aes(x = points)) +
  geom_histogram(aes(y = ..density..), fill = "lightblue", colour = "darkblue", bins = 21) +
  geom_vline(aes(xintercept = mean(points)), color = "darkred", linetype = "dashed", size = 1) +
  theme_minimal() +
  labs(title = "Wines Points Distribution - Points & Probability + Mean", x = "X = Points", y = "Y = Probability")

# Convert the static ggplot into an interactive plotly widget
ggplotly(wines_points_distribution)
# Explore countries and regions

# Countries wordcloud - interactive with mouse hover
# Frequencies are counted per whole country name. A document-term matrix
# would tokenise on whitespace and split multi-word names (e.g. "new zealand")
# into separate words, and it would also materialise a large dense matrix.
country_labels <- tolower(data$country)
# Drop rows with no country recorded (NA or empty string)
country_labels <- country_labels[!is.na(country_labels) & nzchar(country_labels)]
country_counts <- table(country_labels)

countries <- names(country_counts)
frequencies <- as.vector(country_counts)

# wordcloud2() reads the first column as the word and the second as its count
countries_frequencies <- data.frame(countries, frequencies)

wordcloud2(countries_frequencies, backgroundColor = "white", minRotation = -pi/2, maxRotation = -pi/2, size = 1.5)
# Regions wordcloud - interactive with mouse hover
# Frequencies are counted per whole region name. A document-term matrix would
# tokenise on whitespace and split multi-word names (e.g. "willamette valley")
# into separate words, and its dense form (one column per term, one row per
# review) is very memory-hungry for this many rows.
region_labels <- tolower(data$region_1)
# Drop rows with no region recorded (NA or empty string)
region_labels <- region_labels[!is.na(region_labels) & nzchar(region_labels)]
region_counts <- table(region_labels)

regions <- names(region_counts)
frequencies <- as.vector(region_counts)

# wordcloud2() reads the first column as the word and the second as its count
regions_frequencies <- data.frame(regions, frequencies)

wordcloud2(regions_frequencies, backgroundColor = "white", minRotation = -pi/2, maxRotation = -pi/2, size = 3.5)
# World map coloured by the average number of points per country

# Review count and mean points per country; the dataset's "US" label is
# renamed to "USA" so it matches the region name used in the world map data
points_by_country_data <- data %>%
  group_by(country) %>%
  summarise(n = n(), avg_points = mean(points)) %>%
  mutate(country = recode(country, US = "USA"))
## `summarise()` ungrouping output (override with `.groups` argument)

# Polygon outlines of the world's regions
worldmap_init <- map_data("world")

# Keep every map polygon (all.x = TRUE); regions without reviews get NA
points_by_country_data <- worldmap_init %>%
  merge(points_by_country_data,
        by.x = "region",
        by.y = "country",
        all.x = TRUE) %>%
  arrange(desc(order))

map_plot <- ggplot(
  data = points_by_country_data,
  aes(x = long, y = lat, group = group, color = region)
) +
  geom_polygon(aes(fill = avg_points)) +
  scale_fill_viridis_c(option = "plasma") +
  labs(fill = "Average nr. of points") +
  theme_minimal() +
  # Legend is hidden
  theme(legend.position = "none")

# Convert the static ggplot into an interactive plotly widget
ggplotly(map_plot)